if(!file.exists('./../Data/BrainSpan_raw.RData')){
# Load csvs (Downloaded from 'RNA-Seq Gencode v10 summarized to genes' in http://www.brainspan.org/static/download.html)
datExpr = read.csv('./../Data/BrainSpan_genes/expression_matrix.csv', header=FALSE)
datMeta = read.csv('./../Data/BrainSpan_genes/columns_metadata.csv')
geneInfo = read.csv('./../Data/BrainSpan_genes/rows_metadata.csv')
# Remove index column in datExpr
cols = datExpr %>% colnames
datExpr = datExpr %>% dplyr::select(-V1)
colnames(datExpr) = cols[-length(cols)]
# Make sure rows match
if(!all(rownames(datExpr) == geneInfo$row_num)){
print('Columns in datExpr don\'t match the rows in datMeta!')
}
# Assign row names
rownames(datMeta) = paste0('V', datMeta$column_num)
rownames(datExpr) = geneInfo$ensembl_gene_id
# Annotate probes
getinfo = c('ensembl_gene_id','external_gene_id','chromosome_name','start_position',
'end_position','strand','band','gene_biotype','percentage_gc_content')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',
dataset='hsapiens_gene_ensembl',
host='feb2014.archive.ensembl.org') ## Gencode v19
datProbes = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=rownames(datExpr), mart=mart)
datProbes = datProbes[match(rownames(datExpr), datProbes$ensembl_gene_id),]
datProbes$length = datProbes$end_position-datProbes$start_position
# Group brain regions by lobes
datMeta$Brain_Region = as.factor(datMeta$structure_acronym)
datMeta$Brain_lobe = 'Other'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('Ocx','V1C')] = 'Occipital'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('M1C-S1C','DFC','OFC','VFC','M1C')] = 'Frontal'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('PCx','IPC', 'S1C')] = 'Parietal'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('AMY','MGE','STC','ITC','HIP','TCx','A1C')] = 'Temporal'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('MFC')] = 'Limbic'
datMeta$Brain_lobe=factor(datMeta$Brain_lobe, levels=c('Frontal', 'Temporal', 'Parietal', 'Occipital','Limbic','Other'))
# Don't correspond to any lobe: URL, DTH, LGE, STR, CB, CBC, MD, CGE
#################################################################################
# INITIAL FILTERING:
# 1 Filter probes with start or end position missing (none)
to_keep = !is.na(datProbes$length)
datProbes = datProbes[to_keep,]
datExpr = datExpr[to_keep,]
rownames(datProbes) = datProbes$ensembl_gene_id
# 2. Keep only Frontal and Temporal samples to do DEA (filter 301 samples)
to_keep = datMeta$Brain_lobe %in% c('Temporal','Frontal')
datExpr = datExpr[,to_keep]
datMeta = datMeta[to_keep,]
datMeta$Brain_lobe = factor(datMeta$Brain_lobe, levels=c('Temporal','Frontal'))
# 3. Filter probes with only zeros as entries (filter 2056 probes)
to_keep = rowSums(datExpr)>0
datProbes = datProbes[to_keep,]
datExpr = datExpr[to_keep,]
#################################################################################
# Annotate SFARI genes
SFARI_genes = read_csv('./../../Gandal/RNAseq/working_data/SFARI_genes_01-15-2019.csv')
# Get ensemble IDS for SFARI genes
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',
dataset='hsapiens_gene_ensembl',
host='feb2014.archive.ensembl.org') ## Gencode v19
gene_names = getBM(attributes=c('ensembl_gene_id', 'hgnc_symbol'), filters=c('hgnc_symbol'),
values=SFARI_genes$`gene-symbol`, mart=mart) %>%
mutate('gene-symbol'=hgnc_symbol, 'ID'=as.character(ensembl_gene_id)) %>%
dplyr::select('ID', 'gene-symbol')
SFARI_genes = left_join(SFARI_genes, gene_names, by='gene-symbol')
#################################################################################
# Add functional annotation to genes from GO
GO_annotations = read.csv('./../../Gandal/RNAseq/working_data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>%
mutate('ID' = as.character(ensembl_gene_id)) %>%
dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
mutate('Neuronal' = 1)
save(datExpr, datMeta, datProbes, SFARI_genes, GO_neuronal, file='./../Data/BrainSpan_raw.RData')
rm(gene_names, geneInfo, GO_annotations, mart, cols, getinfo, to_keep)
} else load('./../Data/BrainSpan_raw.RData')
MDS takes too long to calculate distances
There doesn’t seem to be a clear pattern for SFARI scores
Applied log2 transformation to the data before performing pca to help the visualisation: % variance explained by 1PC changed from 60 to 90%. This was probably what was happening with Gandal’s dataset as well
91% of the variance of the probes seems to be explained by their mean level of expression
The second PC seems to reflect the variance of the probe
datExpr_pca = prcomp(log2(datExpr+1), scale.=TRUE)
pca_plot = data.frame('ID'=rownames(datExpr), 'PC1'=datExpr_pca$x[,1], 'PC2'=datExpr_pca$x[,2],
'meanExpr'=rowMeans(log2(datExpr+1)), 'sd' = apply(log2(datExpr+1), 1, sd)) %>%
left_join(SFARI_genes, by='ID') %>%
mutate(`gene-score` = ifelse(is.na(`gene-score`),
ifelse(ID %in% GO_neuronal$ID, 'Neuronal', 'Non-Neuronal'),
`gene-score`)) %>%
dplyr::select(ID, PC1, PC2, `gene-score`, meanExpr, sd)
ggplotly(ggplot(pca_plot, aes(PC1, PC2, color=`gene-score`)) + geom_point(alpha=0.2, aes(id=ID)) + theme_minimal() +
xlab(paste0('PC1 (',round(summary(datExpr_pca)$importance[2,1]*100),'%)')) +
ylab(paste0('PC2 (',round(summary(datExpr_pca)$importance[2,2]*100),'%)')) +
scale_color_manual(values=gg_colour_hue(9)))
ggplotly(ggplot(pca_plot, aes(PC1, PC2, color=meanExpr)) + geom_point(alpha=0.5, aes(id=ID)) + theme_minimal() +
xlab(paste0('PC1 (',round(summary(datExpr_pca)$importance[2,1]*100),'%)')) +
ylab(paste0('PC2 (',round(summary(datExpr_pca)$importance[2,2]*100),'%)')) +
scale_colour_viridis())